{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"news similarity-punc-5th-1021.ipynb","provenance":[],"collapsed_sections":[]},"interpreter":{"hash":"3bfce0b4c492a35815b5705a19fe374a7eea0baaa08b34d90450caf1fe9ce20b"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.3"},"widgets":{"application/vnd.jupyter.widget-state+json":{"6ff37bfc03874d9298e5a4a3baf1c0c4":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_b390f2fbec504a5184cd2db7dc1e41a7","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_d049ea5cc1f344f6ae2dc05ca60f5d6e","IPY_MODEL_ac47b5ddd6604956a3599d6bef31fc8d","IPY_MODEL_49b533c3f31e42659021e1e88476272e"]}},"b390f2fbec504a5184cd2db7dc1e41a7":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"g
rid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"d049ea5cc1f344f6ae2dc05ca60f5d6e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_181c08e96fdf4134a05910d16e92a163","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":"100%","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_d316ad3e386646f4aa49039dd2c53e12"}},"ac47b5ddd6604956a3599d6bef31fc8d":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_view_name":"ProgressView","style":"IPY_MODEL_d72821bcf2834bc58b40b7ec19bbcef6","_dom_classes":[],"description":"","_model_name":"FloatProgressModel","bar_style":"success","max":250,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":250,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_e3765dd6f52440eda0a6ed9c129f87bc"}},"49b533c3f31e42659021e1e88476272e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_fdb5b0fc3b894ab6b21722f594ffc50e","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 250/250 [06:11&lt;00:00,  
1.44s/ba]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_16b2f413cbfa45d0a142d7d600c8dcb8"}},"181c08e96fdf4134a05910d16e92a163":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"d316ad3e386646f4aa49039dd2c53e12":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"d72821bcf2834bc58b40b7ec19bbcef6":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_col
or":null,"_model_module":"@jupyter-widgets/controls"}},"e3765dd6f52440eda0a6ed9c129f87bc":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"fdb5b0fc3b894ab6b21722f594ffc50e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"16b2f413cbfa45d0a142d7d600c8dcb8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":nul
l,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"8c79828d49c34c3b92d18679fa993353":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_950166f5481b4681a96b873abeab2709","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_7d9bb7e77e444931a40ee6a416c2a21e","IPY_MODEL_43a50dd7d444428f80915f26b27285e5","IPY_MODEL_334bf5561de4411a987e8465be1afffa"]}},"950166f5481b4681a96b873abeab2709":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"o
rder":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"7d9bb7e77e444931a40ee6a416c2a21e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_947000b23da8462986a943a8331d1f5b","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":"Downloading: ","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_feafb5db517743f18fb3989dfef79f80"}},"43a50dd7d444428f80915f26b27285e5":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_view_name":"ProgressView","style":"IPY_MODEL_e89eff4ae5d543428e510ece1db61391","_dom_classes":[],"description":"","_model_name":"FloatProgressModel","bar_style":"success","max":1420,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":1420,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_548178697c5e4dc5997ae69c1a99034d"}},"334bf5561de4411a987e8465be1afffa":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_15c7fde56a7842a4bfdcbdc44fa4d44d","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 3.20k/? 
[00:00&lt;00:00, 73.5kB/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_443b04ba86804667a67489951ed56fd1"}},"947000b23da8462986a943a8331d1f5b":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"feafb5db517743f18fb3989dfef79f80":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"e89eff4ae5d543428e510ece1db61391":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version"
:"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"548178697c5e4dc5997ae69c1a99034d":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"15c7fde56a7842a4bfdcbdc44fa4d44d":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"443b04ba86804667a67489951ed56fd1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,
"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"0e4968e23ded41d1a171f1c18efa6177":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_07800877139e42b9b3284777cccb0460","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_6645bd9e42b4418daf97b9ecf797f486","IPY_MODEL_47c464dedc7640dda10164448c75745c","IPY_MODEL_0c3da98a951348ffa0f59298d8dda1dd"]}},"07800877139e42b9b3284777cccb0460":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"m
ax_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"6645bd9e42b4418daf97b9ecf797f486":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_54adc81b8b58437b9826d6021e0be520","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":"100%","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_28631787f8134140883bce8838773391"}},"47c464dedc7640dda10164448c75745c":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_view_name":"ProgressView","style":"IPY_MODEL_b7152b6b0fbf4ceabc0ca4236443c2b3","_dom_classes":[],"description":"","_model_name":"FloatProgressModel","bar_style":"success","max":180,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":180,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_0935e6ebe9384131ba51f3ea3f22a69b"}},"0c3da98a951348ffa0f59298d8dda1dd":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_b9701a55c7c44ddd8e6cef8aa2b6d7d3","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 180/180 [02:10&lt;00:00,  
1.40ba/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_99873e3d0ba94e7399689ef99c6e874c"}},"54adc81b8b58437b9826d6021e0be520":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"28631787f8134140883bce8838773391":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"b7152b6b0fbf4ceabc0ca4236443c2b3":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_col
or":null,"_model_module":"@jupyter-widgets/controls"}},"0935e6ebe9384131ba51f3ea3f22a69b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"b9701a55c7c44ddd8e6cef8aa2b6d7d3":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"99873e3d0ba94e7399689ef99c6e874c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":nul
l,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"592936b58cd04038980f56fc9b3f4b35":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_2d27c6f40d204474a8c3c35eeec8fd93","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_c4ee821e1d2348dfa52566c111038870","IPY_MODEL_6bf6c9e2df064a1ca6b64e4f324a4a51","IPY_MODEL_cdc7cd40020f458da05a96682506e33f"]}},"2d27c6f40d204474a8c3c35eeec8fd93":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"o
rder":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"c4ee821e1d2348dfa52566c111038870":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_9a2fa7ff8ffb417da6e68da78b478ace","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":"100%","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_d543aecf8dd74a1c9a0c011ecd965e89"}},"6bf6c9e2df064a1ca6b64e4f324a4a51":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_view_name":"ProgressView","style":"IPY_MODEL_ca523571e3ca4091b66f6c6a6ebe3cd6","_dom_classes":[],"description":"","_model_name":"FloatProgressModel","bar_style":"success","max":20,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":20,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_fc0e8a550e3b445087574b248f018d0b"}},"cdc7cd40020f458da05a96682506e33f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_430e0f3b29f44eb6bef5e8985f3ed767","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 20/20 [00:14&lt;00:00,  
1.37ba/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_ee8181374eac405da4d38e54975bdfbd"}},"9a2fa7ff8ffb417da6e68da78b478ace":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"d543aecf8dd74a1c9a0c011ecd965e89":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"ca523571e3ca4091b66f6c6a6ebe3cd6":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_col
or":null,"_model_module":"@jupyter-widgets/controls"}},"fc0e8a550e3b445087574b248f018d0b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"430e0f3b29f44eb6bef5e8985f3ed767":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"ee8181374eac405da4d38e54975bdfbd":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":nul
l,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"0c4490aebf584dfd82d892b6d6595a54":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_95753358fe9748828b0293744ffd1cb2","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_a4b51273f8bb4989998f42795db7921c","IPY_MODEL_59b53457765c437fa28e658c2af83817","IPY_MODEL_9eb9d23d3d3c446d9022c83e7a01096b"]}},"95753358fe9748828b0293744ffd1cb2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"o
rder":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"a4b51273f8bb4989998f42795db7921c":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_71fe630676654933a3eab241b26cb215","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":"100%","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_497dea9e74de49abae0088adf0ab262a"}},"59b53457765c437fa28e658c2af83817":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_view_name":"ProgressView","style":"IPY_MODEL_75c2a7e9bbdf49139613ddb2c55281fb","_dom_classes":[],"description":"","_model_name":"FloatProgressModel","bar_style":"success","max":50,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":50,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_71c6f45697334b269de8dd9121344c4c"}},"9eb9d23d3d3c446d9022c83e7a01096b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_view_name":"HTMLView","style":"IPY_MODEL_9ecdfb7a5b6141df93820b9fe4a00948","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 50/50 [00:37&lt;00:00,  
1.34ba/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_3c771948af6e4c2cb4976875e6750818"}},"71fe630676654933a3eab241b26cb215":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"497dea9e74de49abae0088adf0ab262a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"75c2a7e9bbdf49139613ddb2c55281fb":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_col
or":null,"_model_module":"@jupyter-widgets/controls"}},"71c6f45697334b269de8dd9121344c4c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"9ecdfb7a5b6141df93820b9fe4a00948":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"3c771948af6e4c2cb4976875e6750818":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":nul
l,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}}}}},"cells":[{"cell_type":"markdown","metadata":{"id":"jZ3H30DSeoXw"},"source":[""]},{"cell_type":"code","metadata":{"id":"s7Fwcp5CLXtg","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1634911486906,"user_tz":-480,"elapsed":31567,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"d6a70bb7-6d21-4ce4-e272-9ad4e86bd75f"},"source":["from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"code","metadata":{"id":"SjDdD4vHN1D6"},"source":["import os\n","os.chdir('/content/drive/MyDrive/transformers/天池-入门NLP - 新闻文本分类')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"laWQqNegpwhR"},"source":["#安装\n","!pip install transformers datasets"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"-tf3bg_DPfgx"},"source":["# 文件读取\n","import pandas as pd\n","from datasets import load_dataset\n","from datasets import Dataset\n","\n","train_df=pd.read_csv('./train_set.csv',sep='\\t')\n","test_df=pd.read_csv('./test_a.csv', sep 
='\\t')\n","df=pd.concat((train_df,test_df))"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"TnHEQBaqTLIc","colab":{"base_uri":"https://localhost:8080/","height":52},"executionInfo":{"status":"ok","timestamp":1634911537023,"user_tz":-480,"elapsed":453,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"e4c23709-e858-4c73-ee18-f88ea75ee5e1"},"source":["#Replace the tokens 3750/648/900 with punctuation marks; the disabled block below drops the original text column and renames the new column to text\n","import re\n","def replacepunc(x):\n","  \"\"\"Return x with the whole tokens 3750, 900 and 648 replaced by ',', '.' and '!'.\n","\n","  Each pattern is anchored at word boundaries so that only complete tokens are\n","  replaced; an unanchored pattern also rewrites the inside of longer ids\n","  (for example '1900' would become '1.').\n","  \"\"\"\n","  x=re.sub(r'\\b3750\\b',\",\",x)\n","  x=re.sub(r'\\b900\\b',\".\",x)\n","  x=re.sub(r'\\b648\\b',\"!\",x)\n","  return x\n","\"\"\"\n","df['words']=df['text'].map(lambda x: replacepunc(x))\n","df.drop('text',axis=1,inplace=True)\n","df.columns=['label','text']\n","\n","#数据载入dataset，去除多余的列，只保留text列\n","data=Dataset.from_pandas(df).remove_columns(['label', '__index_level_0__'])\n","data\"\"\""],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["\"\\ndf['words']=df['text'].map(lambda x: replacepunc(x))\\ndf.drop('text',axis=1,inplace=True)\\ndf.columns=['label','text']\\n\\n#数据载入dataset，去除多余的列，只保留text列\\ndata=Dataset.from_pandas(df).remove_columns(['label', '__index_level_0__'])\\ndata\""]},"metadata":{},"execution_count":6}]},{"cell_type":"code","metadata":{"id":"Vs1cPtqjpSDQ"},"source":["batch_size=1000\n","#all_texts=[data['text'][i:i+batch_size] for i in range(0,len(data),batch_size)]\n","\n","def batch_iterator():\n","  \"\"\"Yield the text column of data in consecutive chunks of batch_size rows.\"\"\"\n","  for i in range(0,len(data),batch_size):\n","    #slice the rows first, then pick the column, instead of indexing the full text column on every step\n","    yield data[i:i+batch_size]['text']"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"x49jbjyuo5Ai"},"source":["#Initialise the tokenizer and the pre-tokenizer\n","from tokenizers import 
decoders,models,normalizers,pre_tokenizers,processors,trainers,Tokenizer\n","\n","#unk_token is the WordPiece option naming the token used for unknown pieces\n","tokenizer=Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))\n","\n","tokenizer.pre_tokenizer=pre_tokenizers.BertPreTokenizer()\n","special_tokens=[\"[UNK]\", \"[PAD]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"]\n","trainer=trainers.WordPieceTrainer(vocab_size=7000,min_frequency=2,special_tokens=[\"[UNK]\",\"[CLS]\",\"[SEP]\",\"[PAD]\",\"[MASK]\"])\n","#the Tokenizer attribute is decoder (singular); assigning to any other name does not install the decoder\n","tokenizer.decoder=decoders.WordPiece(prefix=\"##\")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"6ZY816xowl81"},"source":["#Start training\n","tokenizer.train_from_iterator(batch_iterator(),trainer=trainer)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"vDGR8jOrwrRh"},"source":["#Post-processing applied after tokenization\n","cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n","sep_token_id = tokenizer.token_to_id(\"[SEP]\")\n","mask_token_id = tokenizer.token_to_id(\"[MASK]\")\n","pad_token_id = tokenizer.token_to_id(\"[PAD]\")\n","\n","tokenizer.post_processor = processors.TemplateProcessing(\n","    single=f\"[CLS]:0 $A:0 [SEP]:0\",\n","    pair=f\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n","    special_tokens=[(\"[CLS]\",cls_token_id),(\"[SEP]\",sep_token_id),(\"[MASK]\",mask_token_id)],\n","    )\n","\n","tokenizer.enable_truncation(max_length=512)\n","#pass the id looked up for [PAD]; without pad_id the library default id is used for padding\n","tokenizer.enable_padding(pad_id=pad_token_id,pad_token='[PAD]')\n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"pnXOenT-wz3M"},"source":["#Test the tokenization result\n","encoding = tokenizer.encode('2491 4109 1757 7539 648 3695 3038 4490 23 7019 3731 4109 3792 2465',' 2893 7212 5296 1667 3618 7044 1519 5413 1283 6122 4893 7495 2435 5510')\n","encoding.tokens"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"JYx0116UxIgj"},"source":["#Save the tokenizer and reload it\n","#tokenizer.save(\"tokenizers.json\")\n","\n","from transformers import PreTrainedTokenizerFast\n","fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=\"tokenizers.json\",\n","   
model_max_length=512,mask_token='[MASK]',pad_token='[PAD]',unk_token='[UNK]',\n","   cls_token='[CLS]',sep_token='[SEP]',padding_side='right',\n","   return_special_tokens_mask=True)\n","#mask_token, pad_token etc. must be set on PreTrainedTokenizerFast, otherwise an error is raised"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"6M5J86LWIMKX"},"source":["#data_collator is a function that takes samples and batches them into tensors\n","#Masking inside the data_collator means the random masking is redone in a new way each time.\n","from transformers import DataCollatorForLanguageModeling\n","data_collator=DataCollatorForLanguageModeling(tokenizer=fast_tokenizer,mlm=True,mlm_probability=0.15)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"f0f9hRCJWA6u"},"source":["#Load the model saved during training and continue training\n","#the path matches the directory written by trainer.save_model(\"./pre_Bert\") later in this notebook\n","from transformers import BertForMaskedLM\n","model = BertForMaskedLM.from_pretrained(\"./pre_Bert\")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"jHl6Ru5sxU62"},"source":["#Initialise the BERT model\n","from transformers import BertConfig\n","config = BertConfig(\n","    vocab_size=7000,\n","    hidden_size=512,\n","    intermediate_size=4*512,\n","    max_position_embeddings=512,\n","    num_hidden_layers=4,\n","    num_attention_heads=4,\n","    type_vocab_size=2,\n","    attention_probs_dropout_prob=0.1,\n","    hidden_dropout_prob=0.1,\n","    initializer_range=0.02\n",")\n","\n","from transformers import BertForMaskedLM\n","model = 
BertForMaskedLM(config=config)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"7JSzZ9jbk-o5","colab":{"base_uri":"https://localhost:8080/","height":116,"referenced_widgets":["6ff37bfc03874d9298e5a4a3baf1c0c4","b390f2fbec504a5184cd2db7dc1e41a7","d049ea5cc1f344f6ae2dc05ca60f5d6e","ac47b5ddd6604956a3599d6bef31fc8d","49b533c3f31e42659021e1e88476272e","181c08e96fdf4134a05910d16e92a163","d316ad3e386646f4aa49039dd2c53e12","d72821bcf2834bc58b40b7ec19bbcef6","e3765dd6f52440eda0a6ed9c129f87bc","fdb5b0fc3b894ab6b21722f594ffc50e","16b2f413cbfa45d0a142d7d600c8dcb8"]},"executionInfo":{"status":"ok","timestamp":1634734683850,"user_tz":-480,"elapsed":373022,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"a2c134c2-43f7-404b-bae1-c7c11ece5c52"},"source":["#Tokenize the data and drop the 'text' column, otherwise the concatenation step later raises an error.\n","tokenized_datasets=data.map(lambda examples:fast_tokenizer(examples['text']),batched=True).remove_columns(\"text\")\n","tokenized_datasets"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6ff37bfc03874d9298e5a4a3baf1c0c4","version_minor":0,"version_major":2},"text/plain":["  0%|          | 0/250 [00:00<?, ?ba/s]"]},"metadata":{}},{"output_type":"execute_result","data":{"text/plain":["Dataset({\n","    features: ['attention_mask', 'input_ids', 'text', 'token_type_ids'],\n","    num_rows: 250000\n","})"]},"metadata":{},"execution_count":100}]},{"cell_type":"code","metadata":{"id":"AuMmpZ2LIwy8"},"source":["from itertools import chain\n","\n","block_size = 128\n","def group_texts(examples):\n","  \"\"\"Concatenate every column of the batch and re-cut it into rows of block_size items.\n","\n","  examples maps each column name to a list of lists. The tail that does not fill\n","  a whole block is dropped, and labels is added as a copy of input_ids.\n","  \"\"\"\n","  # Concatenate all texts; chain flattens in one pass, whereas sum(lists, []) re-copies the growing list at every step\n","  concatenated_examples={k:list(chain.from_iterable(examples[k])) for k in examples.keys()}\n","  total_length=len(concatenated_examples[list(examples.keys())[0]])\n","  # Drop the remainder; padding could be added instead if the model supports it - customise as needed.\n","  total_length = (total_length//block_size)*block_size\n","  # Split into chunks of block_size.\n","  result={\n","      
k:[t[i:i+block_size] for i in range(0,total_length,block_size)]\n","      for k,t in concatenated_examples.items()\n","  }\n","  result[\"labels\"]=result[\"input_ids\"].copy()\n","  return result\n","\n","lm_datasets=tokenized_datasets.map(\n","    group_texts,\n","    batched=True,\n","    batch_size=1000,\n","    num_proc=4,\n",")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"UVJCAvvLXU3r"},"source":["#加载和保存拼接后的文本\n","#lm_datasets.save_to_disk('./lm_datasets')\n","\n","import pandas as pd\n","from datasets import load_from_disk\n","lm_datasets=load_from_disk('./lm_datasets')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"jC2-OjE02Zc7","colab":{"base_uri":"https://localhost:8080/","height":102},"executionInfo":{"status":"ok","timestamp":1634818037280,"user_tz":-480,"elapsed":688,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"25ae24fc-3971-4a8a-b937-319cdd08107a"},"source":["#解码分词器预处理的lm_datasets数据，里面有标点符号\n","la=fast_tokenizer.decode(lm_datasets[0]['input_ids'])\n","la"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'[CLS] 2967 6758 339 2021 1854 3731 4109 3792 4149 1519 2058 3912 2465 2410 1219 6654 7539 264 2456 4811 1292 2109 6905 5520 7058 6045 3634 6591 3530 6508 2465 7044 1519 3659 2073, 3731 4109 3792 6831 2614 3370 4269 3370 486 5770 4109 4125, 5445 2466 6831 6758 3743 3630 1726 2313 5906 826 4516 657. 1871 7044, 2967 3731 1757 1939! 2828 4704 7039 3706, 965 2490 7399 3743 2145 2407 7451 3775 6017 5998 1641 299 4704 2621 7029 3056 6333 433! 1667 1099. 2289 1099! 
5780 220 7044 1279 7426 4269, 2967 6758 6631 3099 2205 7305 2620 5977, 3329 1793 6666 2042 3193 4149 1519 7039 3706 2446 5399'"]},"metadata":{},"execution_count":11}]},{"cell_type":"code","metadata":{"id":"vBQtcuRB_miY"},"source":["import os\n","#use .get so a missing variable triggers the assertion message below instead of a bare KeyError\n","assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'\n","\n","!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"irCXJFhbCPF_"},"source":["#Move the model to the TPU\n","import torch_xla.core.xla_model as xm\n","device = xm.xla_device()\n","model.to(device)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"28pLk7dUPLDF"},"source":["#Train on the GPU\n","import torch\n","device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n","model.to(device)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"X2NJr93zI8Wo"},"source":["from transformers import Trainer, TrainingArguments\n","training_args = TrainingArguments(\n","    \"pre-mlm\",\n","    logging_strategy=\"steps\",\n","    logging_steps=3000,\n","    save_strategy=\"steps\",\n","    save_steps=10000,\n","    num_train_epochs=3,\n","    learning_rate=2e-4,\n","    per_device_train_batch_size=128,\n","    weight_decay=0.01\n",")\n","\n","trainer = Trainer(\n","    model=model,\n","    args=training_args,\n","    train_dataset=lm_datasets,\n","    
data_collator=data_collator)\n","#lr=4e-4，跑了5个epoch后loss=1.695。第二天接着跑，lr=2e-4，steps=3000时，loss=1.784。\n","#说明模型训练一定次数后loss离最小点更近，还用原学习率会震荡。"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Qlc3csuSJd6g","colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"status":"ok","timestamp":1634828177742,"user_tz":-480,"elapsed":8045508,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"5821d087-c28d-408d-f500-b48baa7677e1"},"source":["trainer.train()"],"execution_count":null,"outputs":[{"metadata":{"tags":null},"name":"stderr","output_type":"stream","text":["***** Running training *****\n","  Num examples = 1779535\n","  Num Epochs = 3\n","  Instantaneous batch size per device = 128\n","  Total train batch size (w. parallel, distributed & accumulation) = 128\n","  Gradient Accumulation steps = 1\n","  Total optimization steps = 41709\n"]},{"data":{"text/html":["\n","    <div>\n","      \n","      <progress value='7971' max='41709' style='width:300px; height:20px; vertical-align: middle;'></progress>\n","      [ 7971/41709 30:41 < 2:09:57, 4.33 it/s, Epoch 0.57/3]\n","    </div>\n","    <table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: left;\">\n","      <th>Step</th>\n","      <th>Training Loss</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <td>3000</td>\n","      <td>1.783800</td>\n","    </tr>\n","    <tr>\n","      <td>6000</td>\n","      <td>1.780700</td>\n","    </tr>\n","  </tbody>\n","</table><p>"],"text/plain":["<IPython.core.display.HTML object>"]},"metadata":{},"output_type":"display_data"},{"output_type":"display_data","data":{"text/html":["\n","    <div>\n","      \n","      <progress value='41709' max='41709' style='width:300px; height:20px; vertical-align: middle;'></progress>\n","      [41709/41709 2:44:48, Epoch 3/3]\n","    </div>\n","    <table 
border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: left;\">\n","      <th>Step</th>\n","      <th>Training Loss</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <td>3000</td>\n","      <td>1.783800</td>\n","    </tr>\n","    <tr>\n","      <td>6000</td>\n","      <td>1.780700</td>\n","    </tr>\n","    <tr>\n","      <td>9000</td>\n","      <td>1.771600</td>\n","    </tr>\n","    <tr>\n","      <td>12000</td>\n","      <td>1.753900</td>\n","    </tr>\n","    <tr>\n","      <td>15000</td>\n","      <td>1.739000</td>\n","    </tr>\n","    <tr>\n","      <td>18000</td>\n","      <td>1.720800</td>\n","    </tr>\n","    <tr>\n","      <td>21000</td>\n","      <td>1.706600</td>\n","    </tr>\n","    <tr>\n","      <td>24000</td>\n","      <td>1.690300</td>\n","    </tr>\n","    <tr>\n","      <td>27000</td>\n","      <td>1.676500</td>\n","    </tr>\n","    <tr>\n","      <td>30000</td>\n","      <td>1.663200</td>\n","    </tr>\n","    <tr>\n","      <td>33000</td>\n","      <td>1.647300</td>\n","    </tr>\n","    <tr>\n","      <td>36000</td>\n","      <td>1.638600</td>\n","    </tr>\n","    <tr>\n","      <td>39000</td>\n","      <td>1.629500</td>\n","    </tr>\n","  </tbody>\n","</table><p>"],"text/plain":["<IPython.core.display.HTML object>"]},"metadata":{}},{"output_type":"stream","name":"stderr","text":["Saving model checkpoint to pre-mlm/checkpoint-10000\n","Configuration saved in pre-mlm/checkpoint-10000/config.json\n","Model weights saved in pre-mlm/checkpoint-10000/pytorch_model.bin\n","Saving model checkpoint to pre-mlm/checkpoint-20000\n","Configuration saved in pre-mlm/checkpoint-20000/config.json\n","Model weights saved in pre-mlm/checkpoint-20000/pytorch_model.bin\n","Saving model checkpoint to pre-mlm/checkpoint-30000\n","Configuration saved in pre-mlm/checkpoint-30000/config.json\n","Model weights saved in pre-mlm/checkpoint-30000/pytorch_model.bin\n","Saving model checkpoint to 
pre-mlm/checkpoint-40000\n","Configuration saved in pre-mlm/checkpoint-40000/config.json\n","Model weights saved in pre-mlm/checkpoint-40000/pytorch_model.bin\n","\n","\n","Training completed. Do not forget to share your model on huggingface.co/models =)\n","\n","\n"]},{"output_type":"execute_result","data":{"text/plain":["TrainOutput(global_step=41709, training_loss=1.702359629179254, metrics={'train_runtime': 9891.1282, 'train_samples_per_second': 539.737, 'train_steps_per_second': 4.217, 'total_flos': 5.28424108839936e+16, 'train_loss': 1.702359629179254, 'epoch': 3.0})"]},"metadata":{},"execution_count":14}]},{"cell_type":"code","metadata":{"id":"LRlBCECYMLHv","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1634828192057,"user_tz":-480,"elapsed":10970,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"dcc2a44d-a9cc-4dd5-82f1-95ea9a86a02a"},"source":["#保存模型\n","trainer.save_model(\"./pre_Bert\")"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["Saving model checkpoint to ./pre_Bert\n","Configuration saved in ./pre_Bert/config.json\n","Model weights saved in 
./pre_Bert/pytorch_model.bin\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["8c79828d49c34c3b92d18679fa993353","950166f5481b4681a96b873abeab2709","7d9bb7e77e444931a40ee6a416c2a21e","43a50dd7d444428f80915f26b27285e5","334bf5561de4411a987e8465be1afffa","947000b23da8462986a943a8331d1f5b","feafb5db517743f18fb3989dfef79f80","e89eff4ae5d543428e510ece1db61391","548178697c5e4dc5997ae69c1a99034d","15c7fde56a7842a4bfdcbdc44fa4d44d","443b04ba86804667a67489951ed56fd1"]},"id":"4hXP70j0CeHk","executionInfo":{"status":"ok","timestamp":1634907187100,"user_tz":-480,"elapsed":2196,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"64b441a9-d2eb-44f3-ca2f-8b443355a60c"},"source":["#准备进行下游任务微调\n","from datasets import load_metric\n","metric=load_metric(\"accuracy\")"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8c79828d49c34c3b92d18679fa993353","version_minor":0,"version_major":2},"text/plain":["Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]"]},"metadata":{}}]},{"cell_type":"code","metadata":{"id":"klkB8nUsVcIQ"},"source":["#加载训练好的预训练模型\n","from transformers import AutoModelForSequenceClassification\n","model=AutoModelForSequenceClassification.from_pretrained(\"./news-classification/checkpoint-5625\",num_labels=14)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"wknpeP5gciD9"},"source":["#TPU训练\n","import torch_xla.core.xla_model as xm\n","device = xm.xla_device()\n","model.to(device)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"oOfSgLHtEA04"},"source":["import numpy as np\n","def compute_metrics(eval_pred):\n","  predictions, labels = eval_pred\n","  predictions = np.argmax(predictions, axis=1)\n","  \n","  return metric.compute(predictions=predictions, 
references=labels)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"kkBEFd1NejSu"},"source":["#Load the data\n","\n","from datasets import Dataset\n","import pandas as pd\n","#shuffle with a fixed seed so the train/validation split made from this frame is reproducible\n","train_df=pd.read_csv('./train_set.csv',sep='\\t').sample(frac=1,random_state=42)\n","\n","#Replace the three special tokens in the training data with punctuation\n","train_df['texts']=train_df['text'].map(lambda x:replacepunc(x))\n","\n","\n","#Head-and-tail truncation of the text: keep 255 tokens from each end\n","def slipt2(x):\n","  \"\"\"Return x unchanged if it has fewer than 511 space-separated tokens, otherwise its first 255 and last 255 tokens joined by single spaces.\"\"\"\n","  ls=x.split(' ')\n","  le=len(ls)\n","  if le<511:\n","    return x\n","  else:\n","    return ' '.join(ls[:255]+ls[-255:])    "],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"lK3Hbdi2G2t2","colab":{"base_uri":"https://localhost:8080/","height":350,"referenced_widgets":["0e4968e23ded41d1a171f1c18efa6177","07800877139e42b9b3284777cccb0460","6645bd9e42b4418daf97b9ecf797f486","47c464dedc7640dda10164448c75745c","0c3da98a951348ffa0f59298d8dda1dd","54adc81b8b58437b9826d6021e0be520","28631787f8134140883bce8838773391","b7152b6b0fbf4ceabc0ca4236443c2b3","0935e6ebe9384131ba51f3ea3f22a69b","b9701a55c7c44ddd8e6cef8aa2b6d7d3","99873e3d0ba94e7399689ef99c6e874c","592936b58cd04038980f56fc9b3f4b35","2d27c6f40d204474a8c3c35eeec8fd93","c4ee821e1d2348dfa52566c111038870","6bf6c9e2df064a1ca6b64e4f324a4a51","cdc7cd40020f458da05a96682506e33f","9a2fa7ff8ffb417da6e68da78b478ace","d543aecf8dd74a1c9a0c011ecd965e89","ca523571e3ca4091b66f6c6a6ebe3cd6","fc0e8a550e3b445087574b248f018d0b","430e0f3b29f44eb6bef5e8985f3ed767","ee8181374eac405da4d38e54975bdfbd"]},"executionInfo":{"status":"ok","timestamp":1634907522720,"user_tz":-480,"elapsed":157151,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"3fef579c-c9af-4866-eb4b-b912f611b62f"},"source":["#Split into training and validation sets\n","\n","#copy the slices so that adding the summary column does not assign into a slice of train_df (SettingWithCopyWarning)\n","val_df=train_df.iloc[:20000, ].copy()\n","trains_df=train_df.iloc[20000:,].copy()\n","\n","#Head-and-tail truncation\n","val_df['summary']=val_df['texts'].apply(lambda x:slipt2(x))\n","trains_df['summary']=trains_df['texts'].apply(lambda 
x:slipt2(x))\n","\n","#加载到dataset并预处理\n","trains_ds=Dataset.from_pandas(trains_df).remove_columns([\"texts\",\"text\"])\n","val_ds=Dataset.from_pandas(val_df).remove_columns([\"texts\",\"text\"])\n","\n","tokenized_trains_ds=trains_ds.map(lambda examples:fast_tokenizer(examples['summary'],truncation=True,padding=True),batched=True)\n","tokenized_val_ds=val_ds.map(lambda examples:fast_tokenizer(examples['summary'],truncation=True,padding=True),batched=True)\n","tokenized_val_ds"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] = value instead\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n","  import sys\n","/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] = value instead\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n","  \n"]},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"0e4968e23ded41d1a171f1c18efa6177","version_minor":0,"version_major":2},"text/plain":["  0%|          | 0/180 [00:00<?, ?ba/s]"]},"metadata":{}},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"592936b58cd04038980f56fc9b3f4b35","version_minor":0,"version_major":2},"text/plain":["  0%|          | 0/20 [00:00<?, ?ba/s]"]},"metadata":{}},{"output_type":"execute_result","data":{"text/plain":["Dataset({\n","    features: ['__index_level_0__', 'attention_mask', 'input_ids', 'label', 'summary', 'token_type_ids'],\n","    
num_rows: 20000\n","})"]},"metadata":{},"execution_count":14}]},{"cell_type":"code","metadata":{"id":"79vaC6bwzOeZ"},"source":["#进行任务微调\n","from transformers import TrainingArguments,Trainer\n","args=TrainingArguments(\n","  output_dir='news-classification-2',\n","  evaluation_strategy=\"epoch\",\n","  save_strategy=\"epoch\",\n","  learning_rate=2e-5,\n","  per_device_train_batch_size=96,\n","  per_device_eval_batch_size=96,\n","  num_train_epochs=3,\n","  weight_decay=0.01,\n","  load_best_model_at_end=True,\n","  metric_for_best_model=\"accuracy\")\n","\n","trainer=Trainer(\n","  model,\n","  args,\n","  train_dataset=tokenized_trains_ds,\n","  eval_dataset=tokenized_val_ds,\n","  tokenizer=fast_tokenizer,\n","  compute_metrics=compute_metrics)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"y3XBXO18faB5","colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"status":"ok","timestamp":1634910356788,"user_tz":-480,"elapsed":758571,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"1fc5491c-51a4-43bf-bd89-16783d652466"},"source":["trainer.train()\n","#trainer.save_model(\"./finally_bert\")"],"execution_count":null,"outputs":[{"metadata":{"tags":null},"name":"stderr","output_type":"stream","text":["The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.\n","***** Running training *****\n","  Num examples = 180000\n","  Num Epochs = 3\n","  Instantaneous batch size per device = 96\n","  Total train batch size (w. 
parallel, distributed & accumulation) = 96\n","  Gradient Accumulation steps = 1\n","  Total optimization steps = 5625\n"]},{"data":{"text/html":["\n","    <div>\n","      \n","      <progress value='4017' max='5625' style='width:300px; height:20px; vertical-align: middle;'></progress>\n","      [4017/5625 31:42 < 12:42, 2.11 it/s, Epoch 2.14/3]\n","    </div>\n","    <table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: left;\">\n","      <th>Epoch</th>\n","      <th>Training Loss</th>\n","      <th>Validation Loss</th>\n","      <th>Accuracy</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <td>1</td>\n","      <td>0.112900</td>\n","      <td>0.097082</td>\n","      <td>0.969800</td>\n","    </tr>\n","    <tr>\n","      <td>2</td>\n","      <td>0.092900</td>\n","      <td>0.090939</td>\n","      <td>0.971050</td>\n","    </tr>\n","  </tbody>\n","</table><p>"],"text/plain":["<IPython.core.display.HTML object>"]},"metadata":{},"output_type":"display_data"},{"metadata":{"tags":null},"name":"stderr","output_type":"stream","text":["The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.\n","***** Running Evaluation *****\n","  Num examples = 20000\n","  Batch size = 96\n","Saving model checkpoint to news-classification-2/checkpoint-1875\n","Configuration saved in news-classification-2/checkpoint-1875/config.json\n","Model weights saved in news-classification-2/checkpoint-1875/pytorch_model.bin\n","tokenizer config file saved in news-classification-2/checkpoint-1875/tokenizer_config.json\n","Special tokens file saved in news-classification-2/checkpoint-1875/special_tokens_map.json\n","The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.\n","***** Running Evaluation *****\n","  
Num examples = 20000\n","  Batch size = 96\n","Saving model checkpoint to news-classification-2/checkpoint-3750\n","Configuration saved in news-classification-2/checkpoint-3750/config.json\n","Model weights saved in news-classification-2/checkpoint-3750/pytorch_model.bin\n","tokenizer config file saved in news-classification-2/checkpoint-3750/tokenizer_config.json\n","Special tokens file saved in news-classification-2/checkpoint-3750/special_tokens_map.json\n"]},{"output_type":"display_data","data":{"text/html":["\n","    <div>\n","      \n","      <progress value='5625' max='5625' style='width:300px; height:20px; vertical-align: middle;'></progress>\n","      [5625/5625 44:21, Epoch 3/3]\n","    </div>\n","    <table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: left;\">\n","      <th>Epoch</th>\n","      <th>Training Loss</th>\n","      <th>Validation Loss</th>\n","      <th>Accuracy</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <td>1</td>\n","      <td>0.112900</td>\n","      <td>0.097082</td>\n","      <td>0.969800</td>\n","    </tr>\n","    <tr>\n","      <td>2</td>\n","      <td>0.092900</td>\n","      <td>0.090939</td>\n","      <td>0.971050</td>\n","    </tr>\n","    <tr>\n","      <td>3</td>\n","      <td>0.080600</td>\n","      <td>0.089892</td>\n","      <td>0.971900</td>\n","    </tr>\n","  </tbody>\n","</table><p>"],"text/plain":["<IPython.core.display.HTML object>"]},"metadata":{}},{"output_type":"stream","name":"stderr","text":["The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.\n","***** Running Evaluation *****\n","  Num examples = 20000\n","  Batch size = 96\n","Saving model checkpoint to news-classification-2/checkpoint-5625\n","Configuration saved in news-classification-2/checkpoint-5625/config.json\n","Model weights saved in 
news-classification-2/checkpoint-5625/pytorch_model.bin\n","tokenizer config file saved in news-classification-2/checkpoint-5625/tokenizer_config.json\n","Special tokens file saved in news-classification-2/checkpoint-5625/special_tokens_map.json\n","\n","\n","Training completed. Do not forget to share your model on huggingface.co/models =)\n","\n","\n","Loading best model from news-classification-2/checkpoint-5625 (score: 0.9719).\n"]},{"output_type":"execute_result","data":{"text/plain":["TrainOutput(global_step=5625, training_loss=0.09479788547092013, metrics={'train_runtime': 2662.9885, 'train_samples_per_second': 202.78, 'train_steps_per_second': 2.112, 'total_flos': 2.136703463424e+16, 'train_loss': 0.09479788547092013, 'epoch': 3.0})"]},"metadata":{},"execution_count":16}]},{"cell_type":"code","metadata":{"id":"KfDv-o3Kl1ru"},"source":["# Load the test set from a tab-separated file.\n","import pandas as pd\n","from datasets import load_dataset\n","\n","test_df = pd.read_csv('./test_a.csv', sep='\\t')\n","\n","# Run replacepunc over each test text (per the original note, it swaps three\n","# placeholder tokens for punctuation) and store the result in the 'texts' column.\n","test_df['texts'] = test_df['text'].map(replacepunc)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"CEm7hIZV63zC","colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["0c4490aebf584dfd82d892b6d6595a54","95753358fe9748828b0293744ffd1cb2","a4b51273f8bb4989998f42795db7921c","59b53457765c437fa28e658c2af83817","9eb9d23d3d3c446d9022c83e7a01096b","71fe630676654933a3eab241b26cb215","497dea9e74de49abae0088adf0ab262a","75c2a7e9bbdf49139613ddb2c55281fb","71c6f45697334b269de8dd9121344c4c","9ecdfb7a5b6141df93820b9fe4a00948","3c771948af6e4c2cb4976875e6750818"]},"executionInfo":{"status":"ok","timestamp":1634911808338,"user_tz":-480,"elapsed":37728,"user":{"displayName":"张hongxu","photoUrl":"https://lh3.googleusercontent.com/a/default-user=s64","userId":"01344108933923387301"}},"outputId":"d7925754-c3b4-4126-f560-091af8e6c2c8"},"source":["# Preprocess the test data.\n","from datasets import 
Dataset\n","\n","# Build the model input text by applying slipt2 (defined outside this cell) to each row.\n","test_df['summary'] = test_df['texts'].apply(slipt2)\n","\n","# Convert to a datasets.Dataset and drop the two source text columns.\n","test_ds = Dataset.from_pandas(test_df).remove_columns([\"texts\", \"text\"])\n","\n","# Tokenize 'summary' in batches, truncating and padding each batch.\n","tokenized_test_ds = test_ds.map(\n","    lambda examples: fast_tokenizer(examples['summary'], truncation=True, padding=True),\n","    batched=True,\n",")"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"0c4490aebf584dfd82d892b6d6595a54","version_minor":0,"version_major":2},"text/plain":["  0%|          | 0/50 [00:00<?, ?ba/s]"]},"metadata":{}}]},{"cell_type":"markdown","metadata":{"id":"hJ_mdP_48V_R"},"source":[""]},{"cell_type":"code","metadata":{"id":"OwR5tJkRSHz-"},"source":["# Predict on the test set with the trainer and save the predicted labels.\n","# Trainer.predict returns a (predictions, label_ids, metrics) named tuple, in that order.\n","predictions, label_ids, metrics = trainer.predict(tokenized_test_ds, metric_key_prefix=\"test\")\n","pred = np.argmax(predictions, axis=1)\n","pd.DataFrame({'label': pred}).to_csv('submit1022.csv', index=False)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"LHNmEhRLTVD8"},"source":[""],"execution_count":null,"outputs":[]}]}